Directory definition
workingDir <- getwd()
dataDir <- file.path(workingDir, "Data")
resultsDir <- file.path(workingDir,"Out")
getwd()
## [1] "/home/rstudio/NEOPLASY_PRIMATES"
Seed and library loading
set.seed(1998)
# General use libraries
library(phytools)
library(dplyr)
library(castor)
library(tidyr)
# Plotting
library(ggplot2)
library(ggtree)
# Colors
library(microViz)
Tree import
primates_tree <- read.newick(file.path(dataDir,"233-GENOMES/science.abn7829_data_s4.nex.tree"))
# Give a name to species name attribute
tree_species <- primates_tree$tip.label
Cancer traitfile import
cancer_traits <- read.csv("Data/Neoplasia/species360_primates_neoplasia_20230519.csv", sep = ",")
# Remove whitespaces
cancer_traits[] <- lapply(cancer_traits, function(col) trimws(col))
# binarize
cancer_traits$label <- gsub(" ", "_", cancer_traits$species)
# Reorder and filter species
cancer_traits <- cancer_traits %>%
select(family,species,label,common_name,necropsy_count,neoplasia_necropsy,neoplasia_prevalence, everything()) %>%
filter(!is.na(neoplasia_prevalence))
Cancer trait file polishing
Remove spaces at the end, work out name dissimilarities and stuff
like that
common_names <- intersect(primates_tree$tip.label, cancer_traits$label)
direct_matches <- cancer_traits$label %in% tree_species
cancer_species_to_check <- cancer_traits$label[!direct_matches]
tree <- drop.tip(primates_tree, setdiff(primates_tree$tip.label, common_names))
### Mantain structure
cancer_traits <- cancer_traits %>%
mutate(ordering = match(label, tree$tip.label)) %>%
arrange(ordering) %>%
filter(!is.na(ordering))
write.tree(tree, file="Out/1.Pruning/tree_pruning/science.abn7829_data_s4.nex.tree.pruned")
Dissimilarities between the two namesets (tree and traits)
Clustering by branch depth
Revised to do the exploration node-wise
# Get distances from all nodes to root
sorted_distances <- sort(get_all_distances_to_root(tree))
# Identify duplicates rounded to 5 decimal places (for significance)
duplicates <- duplicated(round(unlist(sorted_distances), 5))
# Remove duplicates
unique_distances <- sorted_distances[!duplicates]
# Cut the tree using TreeCluster (bash)
for (index in seq_along(unique_distances)) {
node <- unique_distances[[index]]
# Visualice the node
# echo_command <- sprintf("echo %f", node)
# system(echo_command)
# Cut the tree by every possible nodal distance and collect the clusters in separated files for further use
output_path <- sprintf("Out/2.Clustering/tree_clustering/%d_cut_%f.clusters", index, node)
tree_cluster_command <- sprintf("python3 TreeCluster/TreeCluster.py -i Out/1.Pruning/tree_pruning/science.abn7829_data_s4.nex.tree.pruned -t %f --method root_dist -o %s", node, output_path)
system(tree_cluster_command)
}
Now lets merge all the clusters into a single dataframe, including
also the families
# Identify all cluster files
cluster_files <- list.files(path = "Out/2.Clustering/tree_clustering/", full.names = TRUE)
# Function to read and process a single cluster file
process_cluster_file <- function(file_path) {
# Extract just the filename without the extension
filename_no_ext <- tools::file_path_sans_ext(basename(file_path))
# Extract the desired portion for the header by splitting on '_'
components <- strsplit(filename_no_ext, "_")[[1]]
cut_point <- paste0(components[length(components) - 1], "_", tail(components, 1))
# Read the file into a dataframe
df <- read.csv(file_path, sep = "\t", stringsAsFactors = FALSE)
# Rename columns
colnames(df)[colnames(df) == "SequenceName"] <- "label"
colnames(df)[colnames(df) == "ClusterNumber"] <- cut_point
return(df)
}
# Process all cluster files
list_of_dfs <- lapply(cluster_files, process_cluster_file)
# Merge all data frames by SequenceName
clusters_df <- Reduce(function(x, y) {
merge(x, y, by = "label", all.y = TRUE, no.dups = TRUE)
}, list_of_dfs)
# Keep the SPECIES_BINOMIAL column fixed and order the rest
ordered_columns <- c("label", rev(colnames(clusters_df)[-1]))
# Reorder the columns based on the sorted column names
clusters_df <- clusters_df[, ordered_columns]
# Remove duplicated cuts
## This removes redundancy, as different, very close cuts have exactly the same clusters
clusters_unique <- clusters_df[!duplicated(t(clusters_df))]
# Add families from traitfile
merged_traits <- merge(clusters_unique, cancer_traits, by="label", all.x=TRUE)
clusters_unique <- cbind(clusters_unique[,1], merged_traits$family, clusters_unique[,-1])
colnames(clusters_unique)[1] <- "species"
colnames(clusters_unique)[2] <- "family"
# Recover the cluster file por posterior use
write.csv(clusters_unique, file="Out/2.Clustering/traits_clusters_exploration/clusters_unique.csv")
Create expanded clusters file for further analyses
In order to include species which are not present in the phylogeny
but might be interesting to see how the phenotype is distributed, add
them to an expanded data frame, replicating clustering data from closest
individual
clusters_unique_expanded <- clusters_unique
# Extract the genus from SPECIES_BINOMIAL
clusters_unique_expanded$GENUS <- sapply(strsplit(as.character(clusters_unique_expanded$species), "_"), `[`, 1)
genus_to_check <- sapply(strsplit(common_names, "_"), `[`, 1)
# Loop through cancer_species_to_check and match with merged_traits
for(name in cancer_species_to_check){
print(name)
# Find the genus of the species
genus <- strsplit(name, "_")[[1]][1]
# Check if the genus exists in merged_traits
matching_index <- which(clusters_unique_expanded$GENUS == genus)
# If a match is found, duplicate the row and modify the SPECIES_BINOMIAL
if(length(matching_index) > 0){
print(paste(name,"matches!"))
new_row <- clusters_unique_expanded[matching_index[1],]
new_row$species <- name
clusters_unique_expanded <- rbind(clusters_unique_expanded, new_row)
}
}
## [1] "Ateles_fusciceps"
## [1] "Ateles_fusciceps matches!"
## [1] "Cebus_capucinus"
## [1] "Macaca_sylvanus"
## [1] "Macaca_sylvanus matches!"
## [1] "Nomascus_leucogenys"
## [1] "Pithecia_pithecia"
## [1] "Plecturocebus_donacophilus"
## [1] "Saguinus_leucopus"
## [1] "Saguinus_leucopus matches!"
## [1] "Saimiri_boliviensis"
## [1] "Saimiri_boliviensis matches!"
## [1] "Symphalangus_syndactylus"
# Remove genus column
clusters_unique_expanded$GENUS <- NULL
# Recover the cluster file por posterior use
write.csv(clusters_unique_expanded, file="Out/2.Clustering/traits_clusters_exploration/clusters_unique_expanded.csv")
Visualizing cluster distribution with species present in the
phylogeny and trait values
# Extracting tip labels order from the phylo object
ordered_species <- rev(tree$tip.label)
# Create a new factor with the desired order for SPECIES_BINOMIAL
clusters_unique$species <- factor(clusters_unique$species, levels = ordered_species)
# Convert the data frame from wide to long format
clusters_long <- clusters_unique %>%
gather(key = "cut", value = "cluster", starts_with("cut"))
# Add a cut for FAMILY to visualize it alongside the other cuts
clusters_long <- rbind(clusters_long, data.frame(species = clusters_unique$species, cut = "FAMILY", cluster = as.character(clusters_unique$family), family = clusters_unique$family))
# Reorder the levels of the cut factor to place FAMILY first
ordered_cuts <- c("FAMILY", sort(unique(clusters_long$cut[clusters_long$cut != "FAMILY"])))
clusters_long$cut <- factor(clusters_long$cut, levels = ordered_cuts)
# Calculate the number of unique cluster values and families
num_clusters <- length(unique(clusters_long$cluster))
num_families <- length(unique(clusters_unique$family))
# Combine the colors and levels
all_colors <- distinct_palette(25)
all_levels <- unique(c(as.character(clusters_unique$family), as.character(clusters_long$cluster)))
# Plot
heatmap_plot <- ggplot(clusters_long, aes(x = cut, y = species)) +
geom_tile(aes(fill = cluster), color = "white", linewidth = 0.5) +
scale_fill_manual(values = setNames(all_colors, all_levels)) +
scale_x_discrete(expand = c(0,0)) +
labs(y = "Species", x = "Cut & Family", fill = "Value/Group") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 90, size = 14),
axis.text.y = element_text(size = 14, hjust = 0.5),
axis.title.x = element_text(size = 16, hjust = 0.5, margin = margin(t = 20, b = 10)),
axis.title.y = element_text(size = 16, angle = 90,hjust = 0.5, margin = margin(t = 20, b = 10)),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12))
heatmap_plot

ggsave(filename = "Out/2.Clustering/clustering_family_comparison/cluster_family_comp.svg", plot = heatmap_plot, width = 18, height = 18, device = "svg")
Visualizing cluster distribution with species not present in the
phylogeny
# Factorice and order by family
clusters_unique_expanded$family <- as.factor(sort(clusters_unique_expanded$family))
# Convert the data frame from wide to long format
clusters_expanded_long <- clusters_unique_expanded %>%
gather(key = "cut", value = "cluster", starts_with("cut"))
# Add a cut for FAMILY to visualize it alongside the other cuts
clusters_expanded_long <- rbind(clusters_expanded_long, data.frame(species = clusters_unique_expanded$species, cut = "FAMILY", cluster = as.character(clusters_unique_expanded$family), family = clusters_unique_expanded$family))
# Reorder the levels of the cut factor to place FAMILY first
ordered_cuts <- c("FAMILY", sort(unique(clusters_expanded_long$cut[clusters_expanded_long$cut != "FAMILY"])))
clusters_expanded_long$cut <- factor(clusters_expanded_long$cut, levels = ordered_cuts)
# Calculate the number of unique cluster values and families
num_clusters <- length(unique(clusters_expanded_long$cluster))
num_families <- length(unique(clusters_unique_expanded$FAMILY))
# Combine the colors and levels
all_colors <- distinct_palette(25)
all_levels <- unique(c(as.character(clusters_unique$family), as.character(clusters_long$cluster)))
# Plot
heatmap_expanded_plot <- ggplot(clusters_expanded_long, aes(x = cut, y = species)) +
geom_tile(aes(fill = cluster), color = "white", linewidth = 0.5) +
scale_fill_manual(values = setNames(all_colors, all_levels)) +
scale_x_discrete(expand = c(0,0)) +
labs(y = "Species", x = "Cut & Family", fill = "Value/Group") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 90, size = 14),
axis.text.y = element_text(size = 14, hjust = 0.5),
axis.title.x = element_text(size = 16, hjust = 0.5, margin = margin(t = 20, b = 10)),
axis.title.y = element_text(size = 16, angle = 90, hjust = 0.5, margin = margin(t = 20, b = 10)),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12))
print(heatmap_expanded_plot)

ggsave(filename = "Out/2.Clustering/clustering_family_comparison/cluster_family_comp_w_outspecies.svg", plot = heatmap_expanded_plot, width = 18, height = 18, device = "svg")
Plot together our heatmap with the phylogenetic tree
tree <- ape::rotateConstr(tree, ordered_species)
tree_plot <- ggtree(tree, ladderize = FALSE)
combined_plot <- tree_plot + heatmap_plot
# still same problemo
combined_plot

ggsave(filename = "Out/2.Clustering/clustering_family_comparison/cluster_family_comp_w_tree.png", plot = combined_plot, width = 36, height = 18, dpi = 300)
ggsave(filename = "Out/2.Clustering/clustering_family_comparison/cluster_family_comp_w_tree.svg", plot = combined_plot, width = 36, height = 18, device = "svg")